package com.souchisouwan.example;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.souchisouwan.util.HttpUtil;

/**
 * Scrapes the region / province / city listing from the dianping.com city-list
 * page and prints it to standard output as a tab-indented tree
 * (region, then province, then city).
 *
 * @author Administrator
 */
public class JsoupAction {
	/** Opening tag that wraps some city names in the listing markup. */
	private static final String STRONG_OPEN = "<strong>";
	/** Closing tag matching {@link #STRONG_OPEN}. */
	private static final String STRONG_CLOSE = "</strong>";

	/**
	 * Downloads the city-list page, walks its region / province / city
	 * structure and prints every entry.
	 *
	 * @param args ignored
	 * @throws Exception if the page cannot be fetched or one of the follow-up
	 *                   requests issued by {@link #more(String)} fails
	 */
	public static void main(String[] args) throws Exception {
		String uri = "http://www.dianping.com/citylist";

		String html = HttpUtil.httpPost(uri);

		// Parse the HTML string into a Document.
		Document doc = Jsoup.parse(html);

		// Each ".root" element is treated as one region.
		Elements regions = doc.select(".root");
		for (Element region : regions) {
			String area = region.select(".vocabulary").html();
			Elements provinces = region.select(".terms");
			System.out.println(area);

			for (Element province : provinces) {
				printProvince(province);
			}
		}
	}

	/**
	 * Prints one province: its name, the cities listed inline on the page, and
	 * the additional cities fetched through {@link #more(String)}.
	 *
	 * @param province a ".terms" element of the city-list page
	 * @throws Exception if the follow-up request for additional cities fails
	 */
	private static void printProvince(Element province) throws Exception {
		String provinceName = province.select("dt").html();
		System.out.println("\t" + provinceName);

		Elements cities = province.select("dd>a");
		for (Element city : cities) {
			String cityName = stripStrong(city.html());
			String href = city.attr("href");
			// attr() yields "" when the attribute is absent; only drop the first
			// character (presumably the leading '/' of the path) when there is one.
			String path = href.length() > 0 ? href.substring(1) : href;
			System.out.println("\t\t" + cityName + " " + path);
		}

		// The last "dd>span" carries the id used to request the remaining cities.
		// Elements.last() returns null when nothing matched, so guard against it.
		Element moreSpan = province.select("dd>span").last();
		if (moreSpan == null) {
			return;
		}
		String provinceId = moreSpan.attr("data-v");
		if (provinceId.length() == 0) {
			// No id to query with; skip the request instead of sending a blank pID.
			return;
		}
		for (String extraCity : more(provinceId)) {
			System.out.println("\t\t" + extraCity);
		}
	}

	/**
	 * Removes a {@code <strong>...</strong>} wrapper when it encloses the whole
	 * string; any other input is returned unchanged.
	 *
	 * @param innerHtml inner HTML of a city link
	 * @return the content without the surrounding strong tags
	 */
	private static String stripStrong(String innerHtml) {
		boolean wrapped = innerHtml.length() >= STRONG_OPEN.length() + STRONG_CLOSE.length()
				&& innerHtml.startsWith(STRONG_OPEN)
				&& innerHtml.endsWith(STRONG_CLOSE);
		if (!wrapped) {
			return innerHtml;
		}
		return innerHtml.substring(STRONG_OPEN.length(), innerHtml.length() - STRONG_CLOSE.length());
	}

	/**
	 * Fetches the additional cities of a province from the city-list endpoint.
	 *
	 * @param index province id, appended to the request as the {@code pID} parameter
	 * @return the inner HTML of every {@code a} element in the response, cut at
	 *         the first {@code '&'} when one is present; never {@code null}
	 * @throws Exception if the request fails
	 */
	public static List<String> more(String index) throws Exception {
		List<String> result = new ArrayList<String>();
		String uri = "http://www.dianping.com/ajax/json/index/citylist/getCitylist?_nr_force=122&do=getbyprovince&pID=" + index;

		String html = HttpUtil.httpPost(uri);

		// Parse the response into a Document and collect every link in it.
		Document doc = Jsoup.parse(html);
		Elements links = doc.select("a");
		for (Element link : links) {
			String text = link.html();
			// Keep only the part before the first '&' (presumably a trailing
			// HTML entity); without one, indexOf returns -1, so keep the whole text.
			int cut = text.indexOf("&");
			result.add(cut >= 0 ? text.substring(0, cut) : text);
		}
		return result;
	}
}
